home *** CD-ROM | disk | FTP | other *** search
Wrap
/* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */ /* * misc.c: Miscellaneous routines used everywhere. */ #include "defs.h" #define MYEOF 0xffffffff /* * These are the global variables common to both compress/uncompress/csearch.c */ int RESERVED_CHARS; int MAX_WORDS; int SPECIAL_WORDS; int BEGIN_SPECIAL_WORDS; int END_SPECIAL_WORDS; int NUM_SPECIAL_DELIMITERS; int END_SPECIAL_DELIMITERS; int ONE_VERBATIM; int TC_FOUND_NOTBLANK; int TC_FOUND_BLANK; int usemalloc = 0; char special_texts[] = SPECIAL_TEXTS; char special_delimiters[] = SPECIAL_DELIMITERS; hash_entry freq_words_table[MAX_WORD_LEN+2][256]; /* 256 is the maximum possible number of special words */ char freq_words_strings[256][MAX_WORD_LEN+2]; int freq_words_lens[256]; set_usemalloc() { usemalloc = 1; } unset_usemalloc() { usemalloc = 0; } unsigned int mygetc(fp, buf, maxlen, lenp) FILE *fp; unsigned char *buf; int *lenp; { unsigned int c; if (fp != NULL) c = getc(fp); if (buf != NULL) { if (*lenp >= maxlen) return MYEOF; else c = (unsigned int)buf[*lenp]; } (*lenp) ++; return c; } myfpcopy(fp, src) FILE *fp; char *src; { int i=0; while(*src) { putc(*src, fp); src ++; i ++; } return i; } mystrcpy(dest, src) char *src, *dest; { int i=0; while(*dest = *src) { dest ++; src ++; i ++; } return i; } /* Returns 1 if little endian, 0 if big endian */ int get_endian() { union{ int x; struct { short y1; short y2; } y; } var; var.x = 0xffff0000; if (var.y.y1 == 0) return 1; else return 0; } /* * These procedures take care of the fact that the msb of the encoded * short cannot be < RESERVED_CHARS, and the lsb cannot be equal to '\n' or '\0'. */ unsigned char encode_msb(i) unsigned char i; { return i + RESERVED_CHARS; } unsigned char decode_msb(i) unsigned char i; { return i - RESERVED_CHARS; } unsigned char encode_lsb(i) unsigned char i; { if (i == '\0') return MAX_LSB; if (i == '\n') return MAX_LSB + 1; return i; } unsigned char decode_lsb(i) unsigned char i; { if (i == MAX_LSB) return '\0'; if (i == MAX_LSB + 1) return '\n'; return i; } unsigned short encode_index(i) unsigned short i; { unsigned char msb, lsb; msb = (i / MAX_LSB); lsb = (i % MAX_LSB); msb = encode_msb(msb); lsb = encode_lsb(lsb); return (msb << 8) | lsb; } unsigned short decode_index(i) unsigned short i; { unsigned char msb, lsb; msb = ((i & 0x0ff00) >> 8); lsb = (i & 0x00ff); msb = decode_msb(msb); lsb = decode_lsb(lsb); return (msb * MAX_LSB + lsb); } #if 0 /* This is bullshit */ unsigned short encode_index(i) unsigned short i; { unsigned int msb, lsb; top: msb = (i & 0xff00) >> 8; if ((i & 0x00ff) == '\n') { i = MAX_WORDS + msb; goto top; /* eliminate tail recursion */} lsb = (i & 0x00ff); msb += RESERVED_CHARS; return (0x0000ffff & ((msb << 8) | lsb)); } unsigned short decode_index(i) unsigned short i; { unsigned int msb, lsb, ret; msb = (i & 0xff00) >> 8; lsb = (i & 0x00ff); msb -= RESERVED_CHARS; ret = (0x0000ffff & ((msb << 8) | lsb)); if (ret >= MAX_WORDS) ret = (((ret - MAX_WORDS) << 8) | '\n'); return ret; } #endif /*0*/ char comp_signature[SIGNATURE_LEN]; /* SIGNATURE_LEN - 1 hex-chars terminated by '\0' */ /* returns the number of words read */ build_freq(freq_words_table, freq_words_strings, freq_words_lens, freq_file, flags) hash_entry freq_words_table[MAX_WORD_LEN+2][256]; char freq_words_strings[256][MAX_WORD_LEN+2]; int freq_words_lens[256]; char *freq_file; { FILE *fp = fopen(freq_file, "r"); int len, num, i, j; hash_entry *e; int numsofar = 0; int freq_words; memset(comp_signature, '\0', SIGNATURE_LEN); if (fp == NULL) { if (flags & TC_ERRORMSGS) { fprintf(stderr, "cannot open cast-dictionary file: %s\n", freq_file); fprintf(stderr, "(use -H to give a dictionary-dir or run 'buildcast' to make a dictionary)\n"); } return -1; } /* initialize the tables by accessing only those entries which will be used */ if (SIGNATURE_LEN != fread(comp_signature, 1, SIGNATURE_LEN, fp)) { if (flags & TC_ERRORMSGS) fprintf(stderr, "illegal cast signature in: %s\n", freq_file); fclose(fp); return -1; } comp_signature[SIGNATURE_LEN - 1] = '\0'; /* overwrite '\0' */ fscanf(fp, "%d\n", &freq_words); if ((freq_words < 0) || (freq_words > 256 - MAX_SPECIAL_CHARS)) { if (flags & TC_ERRORMSGS) fprintf(stderr, "illegal number of frequent words %d outside [0, %d] in: %s\n", freq_words, 256-MAX_SPECIAL_CHARS, freq_file); fclose(fp); return -1; } if (freq_words == 0) { fclose(fp); return 0; } for (i=0; i<=MAX_WORD_LEN; i++) { for (j=0; j<freq_words; j++) freq_words_table[i][j].val.offset = -1; } memset(freq_words_lens, '\0', sizeof(int)*freq_words); for (i=0; i<freq_words; i++) { freq_words_strings[i][0] = '\0'; } /* Refer to read_in.c for the format in which these words are dumped */ while (2 == fscanf(fp, "%d %d\n", &len, &num)) { for(i=0; i<num; i++) { e = &(freq_words_table[len][i]); e->word = &(freq_words_strings[numsofar + i][0]); if (1 != fscanf(fp, "%s\n", e->word)) { fclose(fp); return numsofar; } freq_words_lens[numsofar + i] = len; e->val.offset = numsofar + i; /* which-th special word is it? */ if (i + 1 == num) e->next = NULL; else e->next = &(freq_words_table[len][i+1]); } numsofar += num; } fclose(fp); return numsofar; } int initialize_common_done = 0; /* Used in tcomp.c, tuncomp.c and csearch.c */ initialize_common(freq_file, flags) char *freq_file; int flags; { if (initialize_common_done == 1) return 1; if ((SPECIAL_WORDS = build_freq(freq_words_table, freq_words_strings, freq_words_lens, freq_file, flags)) == -1) return 0; BEGIN_SPECIAL_WORDS = MAX_SPECIAL_CHARS; RESERVED_CHARS = END_SPECIAL_WORDS = BEGIN_SPECIAL_WORDS + SPECIAL_WORDS; MAX_WORDS = MAX_LSB*(256-RESERVED_CHARS); /* upper byte must be > RESERVED_CHARS, lower byte must not be '\n' */ TC_FOUND_NOTBLANK = 0; TC_FOUND_BLANK = 0; initialize_common_done = 1; return 1; } uninitialize_common() { initialize_common_done = 0; return; } /* * Simple O(worlen*linelen) search since the average linelen is * guaranteed to be ~ 80/2, and the average wordlen, 2. * SHOULD WORK FOR ANY LEGITIMATE COMPRESSED STRING WITH EASY SEARCH */ int exists_tcompressed_word(word, wordlen, line, linelen, flags) CHAR *word, *line; int wordlen, linelen; { int i, j; #if 0 for (i=0; i<linelen; i++) printf("%d ", line[i]); printf("\n"); for (i=0; i<wordlen; i++) printf("%d ", word[i]); printf("\n"); #endif /*0*/ if (wordlen > linelen) return -1; if (flags & TC_EASYSEARCH) { for (i=0; i<=linelen-wordlen; i++) { if (word[0] == BEGIN_VERBATIM) while ((i <= linelen - wordlen) && (line[i] != BEGIN_VERBATIM)) i++; j = 0; while ((j < wordlen) && (i <= linelen - wordlen) && (word[j] == line[i+j])) j++; if (j >= wordlen) return i; if (i > linelen - wordlen) return -1; /* Goto next-pos for i. Remember: the for loop ALSO skips over one i */ if (line[i] >= RESERVED_CHARS) i++; else if (line[i] == BEGIN_VERBATIM) { i++; while ((i <= linelen - wordlen) && (line[i] != END_VERBATIM)) i++; if (i > linelen - wordlen) return -1; } else if (line[i] == EASY_ONE_VERBATIM) i++; } } else { for (i=0; i<=linelen-wordlen; i++) { if (word[0] == BEGIN_VERBATIM) while ((i <= linelen - wordlen) && (line[i] != BEGIN_VERBATIM)) i++; j = 0; while ((j < wordlen) && (i <= linelen - wordlen) && (word[j] == line[i+j])) j++; if (j >= wordlen) return i; if (i > linelen - wordlen) return -1; /* Goto next-pos for i. Remember: the for loop ALSO skips over one i */ if (line[i] >= RESERVED_CHARS) i++; else if (line[i] == BEGIN_VERBATIM) { i++; while ((i <= linelen - wordlen) && (line[i] != BEGIN_VERBATIM) && (line[i] != END_VERBATIM)) i++; if (i > linelen - wordlen) return -1; if (line[i] == BEGIN_VERBATIM) i--; /* counter-act the i++ */ } } } return -1; } /* * There is a problem here if we use these two routines to search for delimiters: * With outtail set, the implicit blank AFTER the word just before the beginning * of the record and a possible NOTBLANK after the end of the record might be missed. * No way to rectify it now unless we have flags to indicate if these things occured. * That is why, I have introduced TC_FOUND_NOTBLANK and TC_FOUND_BLANK. */ /* return where the word begins or ends (=outtail): range = [begin, end) */ unsigned char * forward_tcompressed_word(begin, end, delim, len, outtail, flags) unsigned char *begin, *end, *delim; int len, outtail, flags; { register unsigned char *curend; register int pos; TC_FOUND_NOTBLANK = 0; if (begin + len > end) return end + 1; curend = begin; top: while ((curend <= end) && (*curend != '\n')) curend ++; if ((pos = exists_tcompressed_word(delim, len, begin, curend-begin, flags)) == -1) { curend ++; /* for next '\n' */ if (curend > end) return end + 1; begin = curend; goto top; } begin += pos; /* place where delimiter begins */ if (outtail) { TC_FOUND_NOTBLANK = 1; return begin + len; } else return begin; } /* return where the word begins or ends (=outtail): range = [begin, end) */ unsigned char * backward_tcompressed_word(end, begin, delim, len, outtail, flags) unsigned char *begin, *end, *delim; int len, outtail, flags; { register unsigned char *curbegin; register int pos; TC_FOUND_BLANK = 0; if (begin + len > end) return begin; curbegin = end; top: while ((curbegin > begin) && (*curbegin != '\n')) curbegin --; if ((pos = exists_tcompressed_word(delim, len, curbegin, end-curbegin, flags)) == -1) { curbegin --; /* for next '\n' */ if (curbegin < begin) return begin; end = curbegin; goto top; } curbegin += pos; /* place where delimiter begins */ if (outtail) { if ((curbegin + len < end) && (*(curbegin + len) != NOTBLANK)) TC_FOUND_BLANK = 1; return curbegin + len; } else return curbegin; }